Before you start

Set my seed

# Any number can be chosen
set.seed(3141596)

Goals of this file

  1. Use raw fastq files and generate quality plots to assess quality of reads.
  2. Filter and trim out bad sequences and bases from our sequencing files.
  3. Write out fastq files with high quality sequences.
  4. Evaluate the quality from our filter and trim.
  5. Infer errors on forward and reverse reads individually.
  6. Identified ASVs on forward and reverse reads seperately using the error model.
  7. Merge forward and reverse ASVs into “contiguous ASVs”.
  8. Generate the ASV count table (otu_table input for phyloseq).
  9. Remove chimeras.
  10. Track the read counts
  11. Assign taxonomy.
  12. Prepare the data for export!

Output that we need:

  1. ASV Count Table: otu_table
  2. Taxonomy Table: tax_table
  3. Sample Information: sample_data - track the reads lost throughout the DADA2 workflow

Load Libraries

# Efficient package loading with pacman 
pacman::p_load(tidyverse, BiocManager, devtools, dada2, 
               phyloseq, patchwork, DT, iNEXT, vegan,
               install = FALSE)

Load Data

# Set the raw fastq path to the raw sequencing files 
# Path to the fastq files 
raw_fastqs_path <- "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs"
raw_fastqs_path
## [1] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs"
# What files are in this path? Intuition Check 
head(list.files(raw_fastqs_path))
## [1] "ERR11588428_1.fastq.gz" "ERR11588428_2.fastq.gz" "ERR11588429_1.fastq.gz"
## [4] "ERR11588429_2.fastq.gz" "ERR11588430_1.fastq.gz" "ERR11588430_2.fastq.gz"
# How many files are there? 
str(list.files(raw_fastqs_path))
##  chr [1:20] "ERR11588428_1.fastq.gz" "ERR11588428_2.fastq.gz" ...
# Create vector of forward reads
forward_reads <- list.files(raw_fastqs_path, pattern = "1.fastq.gz", full.names = TRUE)  
# Intuition Check 
head(forward_reads)  
## [1] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588428_1.fastq.gz"
## [2] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588429_1.fastq.gz"
## [3] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588430_1.fastq.gz"
## [4] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588431_1.fastq.gz"
## [5] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588432_1.fastq.gz"
## [6] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588433_1.fastq.gz"
# Create a vector of reverse reads 
reverse_reads <- list.files(raw_fastqs_path, pattern = "2.fastq.gz", full.names = TRUE)
head(reverse_reads)
## [1] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588428_2.fastq.gz"
## [2] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588429_2.fastq.gz"
## [3] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588430_2.fastq.gz"
## [4] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588431_2.fastq.gz"
## [5] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588432_2.fastq.gz"
## [6] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/01_raw_gzipped_fastqs/ERR11588433_2.fastq.gz"

Assess Raw Read Quality

Evaluate raw sequence quality

Let’s see the quality of the raw reads before we trim

Plot 12 random samples of plots

# Randomly select 2 samples from dataset to evaluate 
random_samples <- sample(1:length(reverse_reads), size = 2)
random_samples
## [1] 9 4
# Calculate and plot quality of these two samples
forward_filteredQual_plot_2 <- plotQualityProfile(forward_reads[random_samples]) + 
  labs(title = "Forward Read: Raw Quality")
# Show the plot
forward_filteredQual_plot_2

reverse_filteredQual_plot_2 <- plotQualityProfile(reverse_reads[random_samples]) + 
  labs(title = "Reverse Read: Raw Quality")
# Show the plot
reverse_filteredQual_plot_2

# Plot them together with patchwork
forward_filteredQual_plot_2 + reverse_filteredQual_plot_2

# When I try to run this I get the following error: Error in Ops.data.frame(guide_loc, panel_loc) : ‘==’ only defined for equally-sized data frames

Aggregated Raw Quality Plots

# Aggregate all QC plots 
# Forward reads
forward_preQC_plot <- 
  plotQualityProfile(forward_reads, aggregate = TRUE) + 
  labs(title = "Forward Pre-QC")
# Show the plot
forward_preQC_plot

# reverse reads
reverse_preQC_plot <- 
  plotQualityProfile(reverse_reads, aggregate = TRUE) + 
  labs(title = "Reverse Pre-QC")
# Show the plot
reverse_preQC_plot

preQC_aggregate_plot <- forward_preQC_plot + reverse_preQC_plot
  # Plot the forward and reverse together 

# When I try to run this I get the following error: Error in Ops.data.frame(guide_loc, panel_loc) : ‘==’ only defined for equally-sized data frames

# Show the plot
preQC_aggregate_plot

Here, we see that the plots are showing the standard Illumina output: The quality is higher at the beginning of the read and slowly gets worse and worse as the read progresses. This is typical of Illumina sequencing because of phasing. The forward reads actually look a bit more low quality than the reverse reads.

Prepare a placeholder for filtered reads

# vector of our samples, extract sample name from files 
samples <- sapply(strsplit(basename(forward_reads), "_"), `[`,1) 
# Intuition Check 
head(samples)
## [1] "ERR11588428" "ERR11588429" "ERR11588430" "ERR11588431" "ERR11588432"
## [6] "ERR11588433"
# Place filtered reads into filtered_fastqs_path
filtered_fastqs_path <- "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs"
filtered_fastqs_path
## [1] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs"
# create 2 variables: filtered_F, filtered_R
filtered_forward_reads <- 
  file.path(filtered_fastqs_path, paste0(samples, "_R1_filtered.fastq.gz"))
length(filtered_forward_reads)
## [1] 10
head(filtered_forward_reads)
## [1] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588428_R1_filtered.fastq.gz"
## [2] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588429_R1_filtered.fastq.gz"
## [3] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588430_R1_filtered.fastq.gz"
## [4] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588431_R1_filtered.fastq.gz"
## [5] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588432_R1_filtered.fastq.gz"
## [6] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588433_R1_filtered.fastq.gz"
# reverse reads
filtered_reverse_reads <- 
  file.path(filtered_fastqs_path, paste0(samples, "_R2_filtered.fastq.gz"))
length(filtered_reverse_reads)
## [1] 10
head(filtered_reverse_reads)
## [1] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588428_R2_filtered.fastq.gz"
## [2] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588429_R2_filtered.fastq.gz"
## [3] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588430_R2_filtered.fastq.gz"
## [4] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588431_R2_filtered.fastq.gz"
## [5] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588432_R2_filtered.fastq.gz"
## [6] "/local/workdir/zlr6/git_repos/biomi6300_moonmilk_amplicon_analysis/data/01_DADA2/02_filtered_fastqs/ERR11588433_R2_filtered.fastq.gz"

Filter and Trim Reads

Parameters of filter and trim DEPEND ON THE DATASET. The things to keep in mind are:
- The library preparation: Are the primers included in the sequence? If so, they need to be trimmed out in this step.
- What do the above quality profiles of the reads look like? If they are lower quality, it is highly recommended to use maxEE = c(1,1).
- Do the reads dip suddenly in their quality? If so, explore trimLeft and truncLen

Check out more of the parameters using ?filterAndTrim to bring up the help page and do some googling about it. Some notes on two examples are below, with a description of a few of the parameters:

Moonmilk Dataset: This dataset was generated with the library preparation described by Theodorescu et al., 2023 Microbial Ecology, the reads maintained high Phred Scores (above 30, even more typically above ~34) until aroud 200bps. Therefore, we will use a stringent maxEE = c(1,1) and truncate the reads at 250bps.

  • maxEE is a quality filtering threshold applied to expected errors. Here, if there’s 2 expected errors. It’s ok. But more than 2. Throw away the sequence. Two values, first is for forward reads; second is for reverse reads.
  • trimLeft can be used to remove the beginning bases of a read (e.g. to trim out primers!)
  • truncLen can be used to trim your sequences after a specific base pair when the quality gets lower. Though, please note that this will shorten the ASVs! For example, this can be used when the quality of the sequence suddenly gets lower, or clearly is typically lower. So, if the quality of the read drops below a phred score of 25 (on the y-axis of the plotQualityProfile above, which indicates ~99.5% confidence per base).
  • maxN the number of N bases. Here, using ASVs, we should ALWAYS remove all Ns from the data.
# Assign a vector to filtered reads 
# trim out poor bases, first 3 bps on F reads
# write out filtered fastq files 
filtered_reads <- 
  filterAndTrim(fwd = forward_reads, filt = filtered_forward_reads,
              rev = reverse_reads, filt.rev = filtered_reverse_reads,
              maxN = 0, maxEE = c(1,1),
              truncLen = 250, rm.phix = TRUE, compress = TRUE, multithread = TRUE)

Assess Trimmed Read Quality

# Plot the 12 random samples after QC
forward_filteredQual_plot_12 <- 
  plotQualityProfile(filtered_forward_reads[random_samples]) + 
  labs(title = "Trimmed Forward Read Quality")
# Show the plot
forward_filteredQual_plot_12

reverse_filteredQual_plot_12 <- 
  plotQualityProfile(filtered_reverse_reads[random_samples]) + 
  labs(title = "Trimmed Reverse Read Quality")
# Show the plot
reverse_filteredQual_plot_12

# Put the two plots together 
# forward_filteredQual_plot_12 + reverse_filteredQual_plot_12
# When I try to run this I get the following error: Error in Ops.data.frame(guide_loc, panel_loc) : ‘==’ only defined for equally-sized data frames

Aggregated Trimmed Plots

# Aggregate all QC plots 
# Forward reads
forward_postQC_plot <- 
  plotQualityProfile(filtered_forward_reads, aggregate = TRUE) + 
  labs(title = "Forward Post-QC")
# Show the plot
forward_postQC_plot

# reverse reads
reverse_postQC_plot <- 
  plotQualityProfile(filtered_reverse_reads, aggregate = TRUE) + 
  labs(title = "Reverse Post-QC")
# Show the plot
reverse_postQC_plot

#postQC_aggregate_plot <- forward_postQC_plot + reverse_postQC_plot
# Show the plot
#postQC_aggregate_plot
# When I try to run this I get the following error: Error in Ops.data.frame(guide_loc, panel_loc) : ‘==’ only defined for equally-sized data frames

The quality of the sequences look much better. It’s even more obvious that the reverse reads are higher quality for some reason, but even so the quality of the forward reads is fine.

Stats on read output from filterAndTrim

# Make output into dataframe 
filtered_df <- as.data.frame(filtered_reads)
head(filtered_df)
##                        reads.in reads.out
## ERR11588428_1.fastq.gz    96845     58385
## ERR11588429_1.fastq.gz   101519     58720
## ERR11588430_1.fastq.gz    95676     57401
## ERR11588431_1.fastq.gz    90880     55342
## ERR11588432_1.fastq.gz    85608     53582
## ERR11588433_1.fastq.gz    73835     45215
# calculate some stats 
filtered_df %>%
  reframe(median_reads_in = median(reads.in),
          median_reads_out = median(reads.out),
          median_percent_retained = (median(reads.out)/median(reads.in)))
##   median_reads_in median_reads_out median_percent_retained
## 1         84523.5          49398.5               0.5844351

we retained 58.4% of our reads. This should be “enough”. I think our filterAndTrim() parameters are alright for this situations.

Visualize QC differences in plot

# Plot the pre and post together in one plot
#preQC_aggregate_plot / postQC_aggregate_plot
# When I try to run this I get the following error: Error in Ops.data.frame(guide_loc, panel_loc) : ‘==’ only defined for equally-sized data frames

Error Modelling

Learn the errors

# Forward reads 
error_forward_reads <- 
  learnErrors(filtered_forward_reads, multithread = TRUE)
## 101085500 total bases in 404342 reads from 8 samples will be used for learning the error rates.
# Plot Forward  
forward_error_plot <- 
  plotErrors(error_forward_reads, nominalQ = TRUE) + 
  labs(title = "Forward Read Error Model")
# Show the plot
forward_error_plot
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

# Reverse reads 
error_reverse_reads <- 
  learnErrors(filtered_reverse_reads, multithread = TRUE)
## 101085500 total bases in 404342 reads from 8 samples will be used for learning the error rates.
# Plot reverse
reverse_error_plot <- 
  plotErrors(error_reverse_reads, nominalQ = TRUE) + 
  labs(title = "Reverse Read Error Model")
# Show the plot
reverse_error_plot
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

# Put the two plots together
# forward_error_plot + reverse_error_plot
# When I try to run this I get the following error: Error in Ops.data.frame(guide_loc, panel_loc) : ‘==’ only defined for equally-sized data frames
  • The error rates for each possible transition (A→C, A→G, …) are shown in the plot above.

Details of the plot: - Points: The observed error rates for each consensus quality score.
- Black line: Estimated error rates after convergence of the machine-learning algorithm.
- Red line: The error rates expected under the nominal definition of the Q-score.

Similar to what is mentioned in the dada2 tutorial: the estimated error rates (black line) are a “reasonably good” fit to the observed rates (points), and the error rates drop slowly with increased quality as expected. We can now infer ASVs!

Infer ASVs

An important note: This process occurs separately on forward and reverse reads! This is quite a different approach from how OTUs are identified in Mothur and also from UCHIME, oligotyping, and other OTU, MED, and ASV approaches.

# Infer ASVs on the forward sequences
dada_forward <- dada(filtered_forward_reads,
                     err = error_forward_reads, 
                     multithread = TRUE)
## Sample 1 - 58385 reads in 27900 unique sequences.
## Sample 2 - 58720 reads in 30013 unique sequences.
## Sample 3 - 57401 reads in 28695 unique sequences.
## Sample 4 - 55342 reads in 7183 unique sequences.
## Sample 5 - 53582 reads in 7981 unique sequences.
## Sample 6 - 45215 reads in 7895 unique sequences.
## Sample 7 - 36726 reads in 11732 unique sequences.
## Sample 8 - 38971 reads in 12236 unique sequences.
## Sample 9 - 35047 reads in 13799 unique sequences.
## Sample 10 - 33571 reads in 6950 unique sequences.
typeof(dada_forward)
## [1] "list"
# Grab a sample and look at it 
dada_forward$`ERR11588428_R1_filtered.fastq.gz`
## dada-class: object describing DADA2 denoising results
## 1113 sequence variants were inferred from 27900 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
# Infer ASVs on the reverse sequences 
dada_reverse <- dada(filtered_reverse_reads,
                     err = error_reverse_reads,
                     multithread = TRUE)
## Sample 1 - 58385 reads in 24541 unique sequences.
## Sample 2 - 58720 reads in 26623 unique sequences.
## Sample 3 - 57401 reads in 25829 unique sequences.
## Sample 4 - 55342 reads in 9593 unique sequences.
## Sample 5 - 53582 reads in 9543 unique sequences.
## Sample 6 - 45215 reads in 5843 unique sequences.
## Sample 7 - 36726 reads in 8137 unique sequences.
## Sample 8 - 38971 reads in 8598 unique sequences.
## Sample 9 - 35047 reads in 15273 unique sequences.
## Sample 10 - 33571 reads in 9235 unique sequences.
# Inspect 
dada_reverse[1]
## $ERR11588428_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 1124 sequence variants were inferred from 24541 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
dada_reverse[10]
## $ERR11588437_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 475 sequence variants were inferred from 9235 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

Merge Forward & Reverse ASVs

Now, merge the forward and reverse ASVs into contigs.

# merge forward and reverse ASVs
merged_ASVs <- mergePairs(dada_forward, filtered_forward_reads, 
                          dada_reverse, filtered_reverse_reads,
                          verbose = TRUE)
## 40228 paired-reads (in 3022 unique pairings) successfully merged out of 50530 (in 11364 pairings) input.
## 37668 paired-reads (in 2880 unique pairings) successfully merged out of 49375 (in 12412 pairings) input.
## 37461 paired-reads (in 2908 unique pairings) successfully merged out of 48706 (in 12020 pairings) input.
## 53845 paired-reads (in 1527 unique pairings) successfully merged out of 54153 (in 1634 pairings) input.
## 52115 paired-reads (in 1726 unique pairings) successfully merged out of 52404 (in 1823 pairings) input.
## 43966 paired-reads (in 1061 unique pairings) successfully merged out of 44432 (in 1144 pairings) input.
## 34647 paired-reads (in 1583 unique pairings) successfully merged out of 35319 (in 1837 pairings) input.
## 36933 paired-reads (in 1550 unique pairings) successfully merged out of 37498 (in 1853 pairings) input.
## 26496 paired-reads (in 2689 unique pairings) successfully merged out of 31199 (in 5866 pairings) input.
## 30449 paired-reads (in 1202 unique pairings) successfully merged out of 31272 (in 1473 pairings) input.
# Evaluate the output 
typeof(merged_ASVs)
## [1] "list"
length(merged_ASVs)
## [1] 10
names(merged_ASVs)
##  [1] "ERR11588428_R1_filtered.fastq.gz" "ERR11588429_R1_filtered.fastq.gz"
##  [3] "ERR11588430_R1_filtered.fastq.gz" "ERR11588431_R1_filtered.fastq.gz"
##  [5] "ERR11588432_R1_filtered.fastq.gz" "ERR11588433_R1_filtered.fastq.gz"
##  [7] "ERR11588434_R1_filtered.fastq.gz" "ERR11588435_R1_filtered.fastq.gz"
##  [9] "ERR11588436_R1_filtered.fastq.gz" "ERR11588437_R1_filtered.fastq.gz"
# Inspect the merger data.frame from the 20210602-MA-ABB1P 
head(merged_ASVs[[3]])
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                            sequence
## 1 GACTACCGGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGCGCCTCAGTGTCAGTATCTGTCCAGGTAGCCGCCTTCGCCACTGGTGTTCCTTCCGATCTCTACGCATTTCACCGCTACACCGGAAATTCCACTACCCTCTACAGTACTCTAGTACATCAGTATAAGTTGCACCTCCCAGGTTAAGCCCAGGTCTTTCACAACTCACTTAATGCACCACCTACGCGCCCTTTACGCCCAGTAACTCCGATTAACGCTTGCACCCTCTGTATTACCGCGGCTGCTGGCACAGAGTTAGCCGGTGCTTATTCTGTTGGTACAATCAAATGTATCATCTCTTAAACTATACATCTTTTCCCCAACCTAAAGTGCTTTACAACCCGAAGGCCTTCTTCACACACGCGGTATTGCTGGATCAGGGTTGCCCCCATTGTCCAATATTCCCCACTGCTGCCCCCCGTAGG
## 2 GACTACCGGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGCGCCTCAGTGTCAGGTGTAGGTTAGAAAACCGCCTTCGCCACCGGTGTTCTTCCACATATCTACGCATTTCACCGCTACATGTGGAATTCCGTTTTCCCCACCTATCCTCTAGATTAGAAGTTTCAGATGCCGCTCCGAAGTTGAGCCCCGGAATTTCACATCTGACTTTCCAAACCACCTACGCGCCCTTTACGCCCAATAAATCCGATTAATGCTTGCACCCTCCGTATTACCGCAGCTGCTGGCACGGAGTTAGCCGGTGCTTCTTTACCTGGTACCCTCAAATTAGCGGATTATTCACCCGCTTTTCTTGTTTCCAAGCGAAAGAGCTTTACAACCCGAAGGCCTTCTTCGCTCACACGGCGTCGCTTCGTCAGGCTTGCGCCCATTGCGAAAGATCCTCGACTGCAGCCCCCCGTAGG
## 3                 GACTACCGGGGTATCTAATCCCGTTTGCTACCCTAGCTTTCGCGCCTCAGCGTCAGGAGAGGTCCAGCGACGCGCTTTCGCCACCGGCGTTCCTACCAATATCAACGCATTTCACCGCTCCACTGGTAGTTCCCGTCGCCCCTACCTCCCTCTAGCCCGCCAGTATCCAGGGCAGTCTTCCGGTTGAGCCGAAAGATTTCACCCTGAACTTAACGAACTGCCTACGCGCCCTTTAAGCCCAGTGATTCCGAACAACGTTCGCACGGTTCGTCTTACCGCGGCTGCTGGCACGAACTTAGCCCGTGCTTCCTCCAGGGATAGGTCAGACCTTGCGGCTTTCCTCCCCCTCGACAGTGGTTTACAACCCGAGGGCCTTCATCCCACACGCGGCGTCGCTCGGTCAAGCTTGCGCTCATTGCCGAAGATCCTCGACTGCAGCCCCCCGTAGG
## 4                          GACTACCGGGGTATCTAATCCCGTTTGCTCCCCTGGCTTTCGCGCCTCAGCGTCAGTGTCAGCCCAGCAACCCGTCTTCACCTCAGGTGTTCCTCTTGATATCTACGCATTTCACCGCTACACCAAGAATTCCGATTGCCCCTTCTGCACTCTAGCGCGACAGTATCACTTGGCCGTTCTGGGTTAAGCCCAGAGATTTCACAAGTGACTTGTCATGCCGCCTACGCGCCCTTTACGCCCAGTAAATCCGAACAACGCTTGGTCCCTACGTATTACCGCGGCTGCTGGCACGTAGTTAGCCGGACCTTATAAATAGTACCGTCATTTATTCTTCCTATTCTTTCGAAGTTTACATACCGAAATACTTCATCCTTCACGCGGCGTTGCTGGGTCAGGGTTTCCCCCATTGCCCAAAATTCCCGACTGCTGCCCCCCGTAGG
## 5                          GACTACCGGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGCGCCTCAGCGTCAGTTGCGAGCCAGAAAGCCGCCTTCGCCTCTGGTGTTCTTCCTAATATCTACGAATTTCACCTCTACACTAGGAATTCCACTTTCCTCTCTCGCACTCTAGCATTCCAGTATGAAACGCACCTCCCGGGTTAAGCCCGGGGCTTTCACGCCTCACTTAAAATACCGCCTACGCGCCCTTTACGCCCAGTCATTCCGAACAACGCTAGCCCCCTCCGTCTTACCGCGGCTGCTGGCACGGAGTTTGCCGGGGCTTCTTCTCCTGCTACCGTCATTATCTTCACAGGTGAAAGAACTTTACAACCCTAAGGCCTTCTTCATTCACGCGGCATTGCTGGATCAGGGTTTCCCCCATTGTCCAATATTCCCCACTGCTGCCCCCCGTAGG
## 6 GACTACCGGGGTATCTAATCCTGTTTGATCCCCACGCTTTCGCGCCTCAGCGTCAGTATTGGTCCAGGAAGCCGCCTTCGCCACTGGTGTTCCTCCGGATATCTACGCATTTCACCGCTACACCCGGAATTCCGCTTCCCTCTACCATACTCTAGCCAGGCAGTATCGAATGCAATTCCCAGGTTGAGCCCGGGGCTTTCACACCCGACTTACCAAACCGCCTACGCGCCCTTTACGCCCAGTAATTCCGATTAACGCTCGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCTTCTGTAAGTAACGTCAAGACCGAGTGATATTAGCACTCGGCTTTTCTTCCCTACTGAAAGTGCTTTACAACCCGCAGGCCTTCTTCACACACGCGGCATCGCTGGATCAGGCTTGCGCCCATTGTCCAATATTCCCCACTGCTGCCCCCCGTAGG
##   abundance forward reverse nmatch nmismatch nindel prefer accept
## 1       100      57     100     35         0      0      2   TRUE
## 2        97      67      53     35         0      0      2   TRUE
## 3        97      88      69     51         0      0      2   TRUE
## 4        96     312     557     60         0      0      1   TRUE
## 5        93      62      58     60         0      0      2   TRUE
## 6        93      28      66     35         0      0      1   TRUE

Create Raw ASV Count Table

# Create the ASV Count Table 
raw_ASV_table <- makeSequenceTable(merged_ASVs)
# Write out the file to data/01_DADA2


# Check the type and dimensions of the data
dim(raw_ASV_table)
## [1]    10 15863
class(raw_ASV_table)
## [1] "matrix" "array"
typeof(raw_ASV_table)
## [1] "integer"
# Inspect the distribution of sequence lengths of all ASVs in dataset 
table(nchar(getSequences(raw_ASV_table)))
## 
##  256  271  272  274  275  290  291  292  294  295  296  297  298  299  372  405 
##    1    2   10    1    5   14  101   15    3   18    6   12   37    2    2    1 
##  409  411  412  416  417  418  419  422  423  425  428  429  432  433  434  435 
##    1    2    3    1    9    1    3    7    4   45    2   62   23   17   44  117 
##  436  437  438  439  440  441  442  443  444  445  446  447  448  449  450  451 
##    3    1   72   28 1557 1023  369  180   54 1574  268  254   12  477    7  153 
##  452  453  454  455  456  457  458  459  460  461  462  463  464  465  466  467 
##   75  140   73  225   24  278   52  113  439   47   58  104  343 5016 2153   95 
##  468  469  476  477  479  480  481  486 
##    4    4    2    7    3    3    1    1
# Inspect the distribution of sequence lengths of all ASVs in dataset 
# AFTER TRIM
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table))) %>%
  ggplot(aes(x = Seq_Length )) + 
  geom_histogram() + 
  labs(title = "Raw distribution of ASV length")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## We're stuck here.